\documentclass{beamer}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{graphicx}
\usepackage{subfigure}
\title{Gesture Recognition for AI Interaction}
\author{Elisabeth Lindquist}


\begin{document}

\begin{frame}
\maketitle
\end{frame}

\begin{frame}
\label{Outline}
\frametitle{Outline}
\begin{itemize}
\item Background and motivation
\item The Kinect device
\item Gesture Recognition
\item Canonical correlation analysis
\item Implementation
\item Results
\item Conclusion
\item Questions
\end{itemize}
\end{frame}


\begin{frame}
\frametitle{Background and motivation}
\label{Background and motivation}
\begin{block}{Motivation}
The purpose of this work is to investigate and implement methods for human-computer interaction using the Kinect device. The implementation was made in cooperation with the Virtual animals project, which is being displayed at Visualiseringscenter C in Norrk\"oping. This is an AI application, where user input may affect the agents' behaviour, but not directly control them.
\end{block}
\end{frame}


\begin{frame}
\label{The Kinect device}
\frametitle{The Kinect device}

	\begin{itemize}
		\item The Microsoft Kinect device is one of the first consumer devices featuring a depth image sensor. 
		\item Its intended use is as a controller for the Microsoft Xbox gaming console. 
		\item An open source driver was developed within a few weeks after its release in November 2010, which allowed for Kinect PC development.
	\end{itemize}
	\begin{figure}
\centering
\includegraphics[width=0.4\textwidth]{kinect.eps}
\end{figure}
\end{frame}

\begin{frame}
\frametitle{The Kinect device} 
\begin{block}{RGB image}
The Kinect features an RGB camera with a resolution of $640 \times 480$ pixels. It has a frame rate of approximately 30 frames per second.
\end{block}
\begin{block}{Audio}
It also features four microphones which can be used for audio cues, voice identification and similar applications. In this project audio is not used.
\end{block}
\end{frame}

\begin{frame}
\frametitle{The Kinect device}
\begin{block}{Depth image}
\begin{itemize}
\item Near-infrared structured light is projected onto the scene in a dot pattern
\item Each point is coded using a PrimeSense proprietary algorithm
\item Point distortions are used to calculate the depth
\end{itemize}
The output is an 11-bit image.
\end{block}
\end{frame}

\begin{frame}
\frametitle{The Kinect device}
\begin{block}{Microsoft gesture recognition}
\begin{itemize}
\item Body parts are identified using a per-pixel technique
\item Official SDK announced in February 2011
\end{itemize}
\end{block}
\end{frame}


\begin{frame}
\frametitle{Gesture Recognition}
\begin{block}{Different approaches to gesture control:}
\begin{itemize}
\item Hand gestures
\item Pose estimation
\end{itemize}
\end{block}

\begin{block}{Several methods for gesture recognition:}
\begin{itemize}
\item Vision based, using image processing
\item Other types of sensors
\end{itemize}
\end{block}
\end{frame}

\begin{frame}
\frametitle{Gesture recognition}
When choosing method for gesture recognition the following requirements were considered:
\begin{itemize}
\item Casual interaction
\item Multiple users
\item Using the depth image
\end{itemize}

With this in mind these approaches were chosen:
\begin{itemize}
\item Tracking position of hands and head 
\item Gesture spotting using time frames
\item Canonical correlation analysis for comparison with prototype gestures
\end{itemize}

\end{frame}

\begin{frame}
\frametitle{Canonical correlation analysis}
\begin{itemize}
\item Used in statistical analysis, and more recently to solve signal processing tasks
\item Find basis vectors for two sets of variables, such that when the variables are projected onto these basis vectors their correlation is maximized
\item Invariant to affine transformations, such as rotation, translation and scaling
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Canonical correlation analysis}
Two random variables, $\mathbf{x}$ and $\mathbf{y}$, have the total covariance matrix
\begin{equation}
\mathbf{C}= \begin{bmatrix} \mathbf{C}_{xx} &\mathbf{C}_{xy}\\\mathbf{C}_{yx} & \mathbf{C}_{yy}\end{bmatrix}= 
E \begin{bmatrix} \begin{pmatrix} \mathbf{x} \\\mathbf{y} \end{pmatrix} \begin{pmatrix}\mathbf{x} \\\mathbf{y}\end{pmatrix}^T\end{bmatrix}
\end{equation}
They are projected onto two vectors, $\mathbf{w}_x$ and $\mathbf{w}_y$. The function to be maximized is the correlation, $\rho$, between the projections of $\mathbf{x}$ and $\mathbf{y}$, $\mathbf{x}^T\mathbf{w}_x$ and $\mathbf{y}^T\mathbf{w}_y$:
\begin{align}
\nonumber \rho &= \frac{E\left[xy \right] }{\sqrt{E\left[x^2 \right]E\left[ y^2\right]  }} = 
\frac{E\left[ \mathbf{\hat{w}}_{x}^{T} \mathbf{xy}^T \hat{\mathbf{w}}_y\right]} 
{\sqrt{E\left[ \mathbf{\hat{w}}_{x}^{T} \mathbf{xx}^T \hat{\mathbf{w}}_x\right]
E\left[ \mathbf{\hat{w}}_{y}^{T} \mathbf{yy}^T \hat{\mathbf{w}}_y\right] }} \\
&=\frac{\mathbf{w}_x^T\mathbf{C}_{xy}\mathbf{w}_y}
{\sqrt{	\mathbf{w}_x^T\mathbf{C}_{xx}\mathbf{w}_x
			\mathbf{w}_y^T\mathbf{C}_{yy}\mathbf{w}_y}}
\end{align}

\end{frame}
\begin{frame}
Setting the derivatives to zero gives the equation system

\begin{equation}
\label{eq:system}
\left\{
  \begin{array}{l}
    \mathbf{C}_{xy}\mathbf{\hat{w}_y} = \rho \lambda \mathbf{C}_{xx}\mathbf{\hat{w}_x}\\
    \mathbf{C}_{yx}\mathbf{\hat{w}_x} = \rho \lambda \mathbf{C}_{yy}\mathbf{\hat{w}_y}\\
  \end{array} \right.
\end{equation}
Equation~\ref{eq:system} can be rewritten as

\begin{equation}
\label{eq:bleh}
\left\{
  \begin{array}{l}
    \mathbf{C}_{xx}^{-1}\mathbf{C}_{xy}\mathbf{C}_{yy}^{-1}\mathbf{C}_{yx}\mathbf{\hat{w}_x} = \rho^2 \mathbf{\hat{w}_x}\\
    \mathbf{C}_{yy}^{-1}\mathbf{C}_{yx}\mathbf{C}_{xx}^{-1}\mathbf{C}_{xy}\mathbf{\hat{w}_y} = \rho^2 \mathbf{\hat{w}_y}\\
  \end{array} \right.
\end{equation}
From Equation~\ref{eq:bleh}, $\mathbf{\hat{w}}_x$ and $\mathbf{\hat{w}}_y$ can be found as the eigenvectors of the matrices $\mathbf{C}_{xx}^{-1}\mathbf{C}_{xy}\mathbf{C}_{yy}^{-1}\mathbf{C}_{yx}$ and $\mathbf{C}_{yy}^{-1}\mathbf{C}_{yx}\mathbf{C}_{xx}^{-1}\mathbf{C}_{xy}$. \\
The eigenvalues, $\rho^2$, that correspond to these eigenvectors are the squared maximum canonical correlations. 

\end{frame}



\begin{frame}
\frametitle{Implementation}
The implementation is made in C++, using the OpenKinect driver for handling Kinect data, and OpenCV for image processing tasks. 
\begin{block}{ }
The RGB and depth images are retrieved from the Kinect device:
\begin{figure}[htbp]
  \centering
  \subfigure{\label{fig:12_6}\includegraphics[width=0.4\textwidth]{0_12_6.eps}}             
  \subfigure{\label{fig:12_6d}\includegraphics[width=0.4\textwidth]{0_12_6d.eps}}
  \caption{Images from Kinect device}
  \label{fig:gesture0}
\end{figure}
\end{block}
\end{frame}

\begin{frame}
\frametitle{Implementation}
The depth image is used to remove the background from the RGB image:
\begin{figure}[htbp]
  \centering
  	\includegraphics[width=0.6\textwidth]{fimpen_mask.eps} 
  \caption{Background removal}
  \label{fig:mask}
\end{figure}

\end{frame}

\begin{frame}
\frametitle{Implementation}
People are found in the RGB image using the cvBlobLibrary:
\begin{figure}[htbp]
  \centering
  	\includegraphics[width=0.6\textwidth]{fimpen_detection.eps} 
  \caption{Rectangle drawn around found person}
  \label{fig:detect}
\end{figure}

\end{frame}

\begin{frame}
\frametitle{Implementation}
\begin{itemize}
\item The maximum and minimum positions in the horizontal direction are stored in a list
\item The size of the list may vary, for the testing sessions 50 values were used
\item Every 10th frame a detection is done, using the last 50 values
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Implementation}
\begin{block}{Training}
If the application is set in training mode, the sequence of estimated hand positions are stored as prototypes. The prototypes are grouped, so that each gesture has a group of several prototypes.
\end{block} 
\begin{block}{Detection}
In detection mode the captured sequence of positions is compared to the previously stored prototypes using CCA. If the maximum correlation is above some threshold, the group where the maximum correlation was found is set to be the gesture that is taking place.
\end{block} 

\end{frame}

\begin{frame}
\frametitle{Implementation}
\begin{block}{AI application}
\begin{itemize}
\item XML-file sent over UDP socket
\item Contains position, speed and gesture for each person found
\end{itemize}
\end{block} 
\end{frame}

\begin{frame}
\frametitle{Implementation}
\begin{block}{Testing application}
\begin{itemize}
\item An application that works with recorded data was developed
\item 5 gestures performed by 12 different people were recorded and used for testing
\end{itemize}
\end{block} 
\end{frame}

\begin{frame}
\frametitle{Implementation}
\begin{figure}[htbp]
  \centering
  	\includegraphics[width=0.4\textwidth]{stick.eps} 
  \caption{Gesture instructions given to test subjects}
  \label{fig:stick}
\end{figure}
\end{frame}


\begin{frame}
\frametitle{Results}
\begin{block}{Testing application}
\begin{itemize}
\item In the testing application a single detection was made
\item In approximately 80\% of the test cases the application correctly detects the gesture
\item Gestures that have similar arm movements are easily confused
\end{itemize}
\end{block}
\end{frame}

\begin{frame}
\frametitle{Results}
\begin{block}{AI application}
\begin{itemize}
\item Poor separation of people standing close to each other
\item Training data set too small and of bad quality
\end{itemize}
\end{block} 
\end{frame}

\begin{frame}
\frametitle{Conclusion}
\begin{itemize}
\item A system for gesture recognition using Canonical correlation analysis was implemented
\item The system can distinguish between gestures, but in order to function in a live application it needs to be more reliable
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Conclusion}
Suggested improvements:
\begin{itemize}
\item Using color to detect hands and head
\item Investigate recorded prototypes to find better features to track, for example by using Principal component analysis.
\end{itemize}
\end{frame}

\begin{frame}
\frametitle{Thank you for listening!}
\begin{center}
\begin{block}{}
\centering
{\huge Questions?}
\end{block}
\end{center}
\end{frame}



\end{document}